#### 144_Renuka_Asane
Extra_Link - https://colab.research.google.com/drive/1uiZIGT3aNRXx0Eto81l_-VxYUOFXQlEe
Assignment No - 1 Simple Linear Regression¶
In [ ]:
#STEP-1: Import Libraries
# Code to read csv file into colaboratory:
# NOTE(review): pydrive is deprecated (see the captured warning below);
# pydrive2 is the maintained fork — consider migrating.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
WARNING:root:pydrive is deprecated and no longer maintained. We recommend that you migrate your projects to pydrive2, the maintained fork of pydrive
In [ ]:
#STEP-2: Authenticate E-Mail ID
# Authorise this Colab session against the user's Google account so that
# datasets can later be fetched from Google Drive via drive.CreateFile().
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
In [ ]:
#installation of python libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load dataset: hours spent driving (X) vs. risk score 0-100 (Y).
data = pd.read_csv('SLR-Data.csv')
print(data.shape)
print(data.head())
#print(data.describe())

# Collecting X (predictor) and Y (response)
X = data['No of Hours Spent During(X)'].values
Y = data['Risk Score on a scale of 0-100(Y)'].values

# Calculate Mean X and Y
mean_x = np.mean(X)
mean_y = np.mean(Y)

# Total number of values
m = len(X)

# Ordinary least squares:
#   b1 = sum((x_i - mean_x) * (y_i - mean_y)) / sum((x_i - mean_x)^2)
#   b0 = mean_y - b1 * mean_x
numer = 0.0
denom = 0.0
for i in range(m):
    numer += (X[i] - mean_x) * (Y[i] - mean_y)
    denom += (X[i] - mean_x) ** 2
b1 = numer / denom
b0 = mean_y - (b1 * mean_x)

# Print coefficients: b1 (slope), b0 (intercept)
print("Slope,Intercept:", b1, b0)

# Plotting Values and Regression Line
max_x = np.max(X)
min_x = np.min(X)
# Calculating line values x and y (50 evenly spaced points by default)
x = np.linspace(min_x, max_x)
y = b0 + b1 * x
plt.plot(x, y, color='green', label='Regression Line')
plt.scatter(X, Y, c='blue', label='Scatter Plot')
plt.xlabel('No of Hours Spent During')
plt.ylabel('Risk Score on a scale of 0-100')
plt.legend()
plt.show()

# Root Mean Squared Error: sqrt(mean of squared residuals)
rmse = 0.0
for i in range(m):
    y_pred = b0 + b1 * X[i]
    rmse += (Y[i] - y_pred) ** 2
rmse = np.sqrt(rmse / m)
print("Root Mean Squares Error:", rmse)

# Coefficient of determination: R^2 = 1 - SS_res / SS_tot
ss_t = 0.0
ss_r = 0.0
for i in range(m):
    y_pred = b0 + b1 * X[i]
    ss_t += (Y[i] - mean_y) ** 2
    ss_r += (Y[i] - y_pred) ** 2
r2 = 1 - (ss_r / ss_t)
print("Accuracy:", r2 * 100)

# Predict y for a user-supplied x.
# BUG FIX: the original hard-coded stale copies of the coefficients
# (4.58789861 and 12.584627964022907) instead of using the fitted b0/b1,
# so the prediction would silently go wrong if the dataset changed.
predict_x = int(input('Enter No Hours Spent in Driving:'))
predict_y = b0 + b1 * predict_x
plt.scatter(X, Y)
plt.scatter(predict_x, predict_y)
plt.xlabel('No Hours Spent Driving(Predicted_x)')
plt.ylabel('Risk Score on a Scale of 0-100(Predicted_y)')
# plotting the regression line over the data plus the predicted point
plt.scatter(X, Y, c='blue')
plt.plot(x, y, color='green')
plt.show()
(8, 2) No of Hours Spent During(X) Risk Score on a scale of 0-100(Y) 0 10 95 1 9 80 2 2 10 3 15 50 4 10 45 Slope,Intercept: 4.58789860997547 12.584627964022893
Root Mean Squares Error: 22.759716640449565 Accuracy: 43.709481451010035 Enter No Hours Spent in Driving:3
Assignment No - 2 Principal Component Analysis¶
In [ ]:
# PCA demo: project the 4-D iris measurements onto 2 principal components.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
%matplotlib inline
In [ ]:
# Fetch the raw iris data (no header row) straight from the UCI repository
# and assign column names explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
df = pd.read_csv(url
, names=['sepal length','sepal width','petal length','petal width','target'])
df.head()
Out[ ]:
| sepal length | sepal width | petal length | petal width | target | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
In [ ]:
# Split the frame into the numeric feature matrix and the target labels.
features = ['sepal length', 'sepal width', 'petal length', 'petal width']
x = df.loc[:, features].values
In [ ]:
y = df.loc[:,['target']].values
In [ ]:
# Standardise features to zero mean / unit variance — PCA is scale-sensitive.
x = StandardScaler().fit_transform(x)
In [ ]:
# Display-only: show the standardised values.
pd.DataFrame(data = x, columns = features).head()
Out[ ]:
| sepal length | sepal width | petal length | petal width | |
|---|---|---|---|---|
| 0 | -0.900681 | 1.032057 | -1.341272 | -1.312977 |
| 1 | -1.143017 | -0.124958 | -1.341272 | -1.312977 |
| 2 | -1.385353 | 0.337848 | -1.398138 | -1.312977 |
| 3 | -1.506521 | 0.106445 | -1.284407 | -1.312977 |
| 4 | -1.021849 | 1.263460 | -1.341272 | -1.312977 |
In [ ]:
#PCA Projection to 2D
# Fit PCA on the standardised features and keep the two components with
# the largest explained variance.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents
, columns = ['principal component 1', 'principal component 2'])
principalDf.head(5)
Out[ ]:
| principal component 1 | principal component 2 | |
|---|---|---|
| 0 | -2.264542 | 0.505704 |
| 1 | -2.086426 | -0.655405 |
| 2 | -2.367950 | -0.318477 |
| 3 | -2.304197 | -0.575368 |
| 4 | -2.388777 | 0.674767 |
In [ ]:
df[['target']].head()
Out[ ]:
| target | |
|---|---|
| 0 | Iris-setosa |
| 1 | Iris-setosa |
| 2 | Iris-setosa |
| 3 | Iris-setosa |
| 4 | Iris-setosa |
In [ ]:
# Re-attach the labels to the 2-D projection (row order is preserved,
# so a plain column-wise concat is safe here).
finalDf = pd.concat([principalDf, df[['target']]], axis = 1)
finalDf.head(5)
Out[ ]:
| principal component 1 | principal component 2 | target | |
|---|---|---|---|
| 0 | -2.264542 | 0.505704 | Iris-setosa |
| 1 | -2.086426 | -0.655405 | Iris-setosa |
| 2 | -2.367950 | -0.318477 | Iris-setosa |
| 3 | -2.304197 | -0.575368 | Iris-setosa |
| 4 | -2.388777 | 0.674767 | Iris-setosa |
In [ ]:
# Scatter the 2-D projection, one colour per iris species.
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 Component PCA', fontsize = 20)
targets = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']
# Plot each species separately so the legend maps colour -> species.
for target, color in zip(targets,colors):
indicesToKeep = finalDf['target'] == target
ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
, finalDf.loc[indicesToKeep, 'principal component 2']
, c = color
, s = 50)
ax.legend(targets)
ax.grid()
In [ ]:
# Explained variance: fraction of the total variance captured by each of
# the two retained principal components.
pca.explained_variance_ratio_
Out[ ]:
array([0.72770452, 0.23030523])
Assignment 3 Decision Tree¶
In [ ]:
# Pull the decision-tree dataset from Google Drive by file id.
downloaded = drive.CreateFile({'id':'1jql2mwV15BCFeX52G1PGSCr8Y4jLdn8f'}) # replace the id with id of file you want to access
downloaded.GetContentFile('DT-Data.csv')
In [ ]:
#import packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Reading dataset: feature columns plus a class-label column at index 5.
dataset = pd.read_csv("DT-Data.csv")
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, 5].values

# Perform label encoding on every feature column.
# FIX: removed the unused `labelencoder_X` local — the .apply() call below
# fits a fresh LabelEncoder per column already.
from sklearn.preprocessing import LabelEncoder
X = X.apply(LabelEncoder().fit_transform)
print(X)

from sklearn.tree import DecisionTreeClassifier
# FIX: renamed `regressor` -> `classifier`; this is a classification tree.
classifier = DecisionTreeClassifier()
# Column 0 is a row id, so train only on columns 1..4
# (age, income, gender, marital_status per the printed frame).
classifier.fit(X.iloc[:, 1:5], y)

# Predict the class for one already-encoded sample.
X_in = np.array([0, 1, 0, 1])
y_pred = classifier.predict([X_in])
print("Prediction:", y_pred)

# Render the fitted tree to a PNG via graphviz.
from six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Create DOT data
dot_data = StringIO()
export_graphviz(classifier, out_file=dot_data, filled=True, rounded=True, special_characters=True)
# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('Decision_Tree.png')
# Show graph
Image(graph.create_png())
id age income gender marital_status 0 0 1 0 1 1 1 1 1 0 1 0 2 2 0 0 1 1 3 3 2 2 1 1 4 4 2 1 0 1 5 5 2 1 0 0 6 6 0 1 0 0 7 7 1 2 1 1 8 8 1 1 0 0 9 9 2 2 0 1 10 10 1 2 0 0 11 11 0 2 1 0 12 12 0 0 0 1 13 13 2 2 1 0 Prediction: ['Yes']
/usr/local/lib/python3.10/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names warnings.warn(
Out[ ]:
OR¶
In [ ]:
# Load dataset
# NOTE(review): this "OR" variant repeats the decision-tree pipeline above
# on a different csv; this read is redundant with the one in the next cell.
data = pd.read_csv('data.csv')
In [ ]:
#import packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#reading Dataset
dataset=pd.read_csv("data.csv")
# Keep all columns but the last as features.
X=dataset.iloc[:,:-1]
# NOTE(review): iloc[:,5] selects the 6th column ('VOC' per the printed
# frame), NOT the last column — confirm this is really the intended label.
y=dataset.iloc[:,5].values
#Perform Label encoding
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()  # NOTE(review): unused; .apply() below fits its own encoders
X = X.apply(LabelEncoder().fit_transform)
print (X)
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): named `regressor` but it is a classifier.
regressor=DecisionTreeClassifier()
# Train on feature columns 1..4 only (mirrors the cell above).
regressor.fit(X.iloc[:,1:5],y)
#Predict value for the given expression
X_in=np.array([0,1,0,1])
y_pred=regressor.predict([X_in])
print ("Prediction:", y_pred)
from six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus
# Create DOT data
dot_data = StringIO()
export_graphviz(regressor, out_file=dot_data, filled=True, rounded=True, special_characters=True)
# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
#graph.write_png('Decision_Tree.png')
# Show graph
Image(graph.create_png())
footfall tempMode AQ USS CS VOC RP IP Temperature 0 0 7 6 0 5 6 17 2 0 1 71 1 2 2 4 1 1 3 0 2 29 7 1 1 5 1 5 5 0 3 58 4 2 3 4 1 9 5 0 4 90 7 4 5 3 0 49 5 0 .. ... ... .. ... .. ... .. .. ... 939 0 7 6 0 5 4 54 5 23 940 0 7 4 1 5 6 31 5 23 941 0 3 5 1 6 5 24 5 23 942 0 6 5 1 4 6 27 6 23 943 18 7 3 1 5 3 42 6 23 [944 rows x 9 columns] Prediction: [5]
/usr/local/lib/python3.10/dist-packages/sklearn/base.py:493: UserWarning: X does not have valid feature names, but DecisionTreeClassifier was fitted with feature names warnings.warn(
dot: graph is too large for cairo-renderer bitmaps. Scaling by 0.784556 to fit
Out[ ]:
Assignment no 4 Naive Bayes¶
In [ ]:
# Naive Bayes on iris: imports.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
In [ ]:
#from sklearn.datasets import load_iris
#dataset = load_iris()
#dataset
# Fetch iris.csv from Google Drive (share link in the comment below).
downloaded = drive.CreateFile({'id':'12BY34aCbYLoLjy3gDUMrZEBUf7l5FZsd'}) # replace the id with id of file you want to access
downloaded.GetContentFile('iris.csv')
dataset=pd.read_csv("iris.csv")
dataset
#https://drive.google.com/file/d/12BY34aCbYLoLjy3gDUMrZEBUf7l5FZsd/view?usp=share_link
Out[ ]:
| sepal.length | sepal.width | petal.length | petal.width | variety | |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | Setosa |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | Setosa |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | Setosa |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | Setosa |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | Setosa |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | Virginica |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | Virginica |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | Virginica |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | Virginica |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | Virginica |
150 rows × 5 columns
In [ ]:
# Splitting the dataset into independent (4 measurements) and dependent
# (species) variables.
X = dataset.iloc[:,:4].values
y = dataset['variety'].values
In [ ]:
# Splitting the dataset into the Training set and Test set (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 82)
In [ ]:
# Fitting Gaussian Naive Bayes to the training set.
# NOTE(review): the original comment mentioned a "linear kernel" — GaussianNB
# has no kernel; that wording was copied from an SVM example.
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
In [ ]:
# NOTE(review): this cell and the next are verbatim duplicates of the one
# above; re-running them refits the same model with identical results.
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
In [ ]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
In [ ]:
# Calculate the confusion matrix and assign it to 'cm'
cm = confusion_matrix(y_test, y_pred)
In [ ]:
# Derive accuracy from the confusion matrix: diagonal entries are correct
# predictions, every off-diagonal entry is a misclassification.
n_rows, n_cols = cm.shape
corrPred = sum(cm[r, c] for r in range(n_rows) for c in range(n_cols) if r == c)
falsePred = sum(cm[r, c] for r in range(n_rows) for c in range(n_cols) if r != c)
print('Correct predictions: ', corrPred)
print('False predictions', falsePred)
print ('\n\nAccuracy of the Naive Bayes Clasification is: ', corrPred/(cm.sum()))
Correct predictions: 28 False predictions 2 Accuracy of the Naive Bayes Clasification is: 0.9333333333333333
Assignment no - 5 SVM¶
In [ ]:
# Load the important packages
from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.svm import SVC
In [ ]:
# Load the datasets
# Only the first two features are kept so the decision boundary can be
# drawn in 2-D.
cancer = load_breast_cancer()
X = cancer.data[:, :2]
y = cancer.target
In [ ]:
#Build the model
# Linear-kernel SVM: decision boundary is a straight line in the 2-D
# feature space.
svm = SVC(kernel="linear")
# Train the model on the full (unsplit) dataset
svm.fit(X, y)
# Plot Decision Boundary
DecisionBoundaryDisplay.from_estimator(
svm,
X,
response_method="predict",
cmap=plt.cm.Spectral,
alpha=0.8,
xlabel=cancer.feature_names[0],
ylabel=cancer.feature_names[1],
)
# Scatter plot of the raw points, coloured by class, over the boundary
plt.scatter(X[:, 0], X[:, 1],
c=y,
s=20, edgecolors="k")
plt.show()
In [ ]:
#Build the model
# RBF-kernel SVM (gamma=0.5, C=1.0): produces a non-linear boundary,
# for comparison with the linear-kernel plot above.
svm = SVC(kernel="rbf", gamma=0.5, C=1.0)
# Train the model on the full (unsplit) dataset
svm.fit(X, y)
# Plot Decision Boundary
DecisionBoundaryDisplay.from_estimator(
svm,
X,
response_method="predict",
cmap=plt.cm.Spectral,
alpha=0.8,
xlabel=cancer.feature_names[0],
ylabel=cancer.feature_names[1],
)
# Scatter plot of the raw points, coloured by class, over the boundary
plt.scatter(X[:, 0], X[:, 1],
c=y,
s=20, edgecolors="k")
plt.show()
Assignment 6 - K-Means¶
In [ ]:
#import packages
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
#create dataset using DataFrame: 8 hand-picked 2-D points
df=pd.DataFrame({'X':[0.1,0.15,0.08,0.16,0.2,0.25,0.24,0.3],
'y':[0.6,0.71,0.9,0.85,0.3,0.5,0.1,0.2]})
f1 = df['X'].values
f2 = df['y'].values
# Pair the two columns into an (8, 2) array of points.
X = np.array(list(zip(f1, f2)))
print(X)
#centroid points: initial guesses m1=(0.1,0.6) and m2=(0.3,0.2)
C_x=np.array([0.1,0.3])
C_y=np.array([0.6,0.2])
centroids=C_x,C_y
#plot the given points
colmap = {1: 'r', 2: 'b'}
plt.scatter(f1, f2, color='k')
plt.show()
#for i in centroids():
plt.scatter(C_x[0],C_y[0], color=colmap[1])
plt.scatter(C_x[1],C_y[1], color=colmap[2])
plt.show()
C = np.array(list((C_x, C_y)), dtype=np.float32)
print (C)
#plot given elements with centroid elements
plt.scatter(f1, f2, c='#050505')
print("point No.6[0.25,0.5] is belongs to blue cluster(cluster no:2)")
plt.scatter(C_x[0], C_y[0], marker='*', s=200, c='r')
plt.scatter(C_x[1], C_y[1], marker='*', s=200, c='b')
plt.show()
#import KMeans class and create object of it
from sklearn.cluster import KMeans
model=KMeans(n_clusters=2,random_state=0)
model.fit(X)
labels=model.labels_
print(labels)
#using labels, count the points assigned to cluster label 1
count=0
for i in range(len(labels)):
if (labels[i]==1):
count=count+1
# NOTE(review): prints count-1, not count — presumably to exclude the
# queried point No.6 itself from the "population"; confirm the intent.
print('No of population around cluster 2:',count-1)
#Find new centroids after KMeans convergence
new_centroids = model.cluster_centers_
print('Previous value of m1 and m2 is:')
print('M1==',centroids[0])
# NOTE(review): the two lines below are labelled 'M1==' but print m2's value.
print('M1==',centroids[1])
print('Updated value of m1 and m2 is:')
print('M1==',new_centroids[0])
print('M1==',new_centroids[1])
[[0.1 0.6 ] [0.15 0.71] [0.08 0.9 ] [0.16 0.85] [0.2 0.3 ] [0.25 0.5 ] [0.24 0.1 ] [0.3 0.2 ]]
[[0.1 0.3] [0.6 0.2]] point No.6[0.25,0.5] is belongs to blue cluster(cluster no:2)
[1 1 1 1 0 0 0 0] No of population around cluster 2: 3 Previous value of m1 and m2 is: M1== [0.1 0.3] M1== [0.6 0.2] Updated value of m1 and m2 is: M1== [0.2475 0.275 ] M1== [0.1225 0.765 ]
In [ ]:
#STEP-1: Import Libraries
# Code to read csv file into colaboratory:
# NOTE(review): verbatim repeat of the PyDrive setup at the top of the file.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
In [ ]:
#STEP-2: Authenticate E-Mail ID
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
In [ ]:
#STEP-3: Get File from Drive using file-ID
# NOTE(review): 'hw.csv' is downloaded here but never read anywhere below.
downloaded = drive.CreateFile({'id':'1e10Ynfgrc35FtMl2V5qpzTGyuWF4KQsZ'}) # replace the id with id of file you want to access
downloaded.GetContentFile('hw.csv')
Assignment 7 - Gradient Boost Classifier¶
In [ ]:
# Import all relevant libraries for the gradient-boosting assignment
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import preprocessing
import warnings
# Silence sklearn/pandas warnings for the remainder of the notebook.
warnings.filterwarnings("ignore")
In [ ]:
#STEP-1: Import Libraries
# Code to read csv file into colaboratory:
# NOTE(review): another verbatim repeat of the PyDrive setup.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
In [ ]:
#STEP-2: Authenticate E-Mail ID
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
In [ ]:
# Get File from Drive using file-ID
downloaded = drive.CreateFile({'id':'1zI-X3zdiuM9u74zQyKIShvAUtPJQ7jUK'}) # replace the id with id of file you want to access
downloaded.GetContentFile('income_evaluation.csv')
# https://drive.google.com/file/d/1zI-X3zdiuM9u74zQyKIShvAUtPJQ7jUK/view?usp=sharing (Dataset Downloads Link)
In [ ]:
#Now let’s read the dataset and look at the columns to understand the information better.
#https://drive.google.com/file/d/1zI-X3zdiuM9u74zQyKIShvAUtPJQ7jUK/view?usp=sharing
# Adult census income dataset; note that column names carry a leading space
# in this csv (e.g. ' income'), as shown by df.columns further down.
df = pd.read_csv('income_evaluation.csv')
df.head()
Out[ ]:
| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
In [ ]:
df.shape
Out[ ]:
(32561, 15)
In [ ]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 32561 entries, 0 to 32560 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 32561 non-null int64 1 workclass 32561 non-null object 2 fnlwgt 32561 non-null int64 3 education 32561 non-null object 4 education-num 32561 non-null int64 5 marital-status 32561 non-null object 6 occupation 32561 non-null object 7 relationship 32561 non-null object 8 race 32561 non-null object 9 sex 32561 non-null object 10 capital-gain 32561 non-null int64 11 capital-loss 32561 non-null int64 12 hours-per-week 32561 non-null int64 13 native-country 32561 non-null object 14 income 32561 non-null object dtypes: int64(6), object(9) memory usage: 3.7+ MB
In [ ]:
df.isnull().sum()
Out[ ]:
| 0 | |
|---|---|
| age | 0 |
| workclass | 0 |
| fnlwgt | 0 |
| education | 0 |
| education-num | 0 |
| marital-status | 0 |
| occupation | 0 |
| relationship | 0 |
| race | 0 |
| sex | 0 |
| capital-gain | 0 |
| capital-loss | 0 |
| hours-per-week | 0 |
| native-country | 0 |
| income | 0 |
In [ ]:
#df.drop(columns=' fnlwgt',inplace=True)
# List the columns — every name except 'age' has a leading space, which is
# why later lookups use keys like ' income'.
df.columns
Out[ ]:
Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
' marital-status', ' occupation', ' relationship', ' race', ' sex',
' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
' income'],
dtype='object')
In [ ]:
# Split into features and target; the label column is ' income'
# (leading space included, matching the csv header).
X = df.drop(columns=' income')
y = df[' income']
In [ ]:
from sklearn.preprocessing import LabelEncoder
In [ ]:
def label_encoder(a, frame=None):
    """Label-encode column *a* of *frame* in place.

    Generalised: the original always mutated the module-level ``df``.
    The new optional ``frame`` parameter (default ``None`` preserves the
    old behaviour) lets the helper be reused on any DataFrame.

    NOTE(review): this helper is defined but never called in this
    notebook — the encoding further down is done inline via select_dtypes.
    """
    if frame is None:
        frame = df  # backward-compatible fallback to the global DataFrame
    le = LabelEncoder()
    frame[a] = le.fit_transform(frame[a])
In [ ]:
df.columns
Out[ ]:
Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
' marital-status', ' occupation', ' relationship', ' race', ' sex',
' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
' income'],
dtype='object')
In [ ]:
# Categorical columns intended for encoding.
# NOTE(review): this list is never used — the encoding below derives the
# categorical columns via select_dtypes instead.
label_list = [' workclass', ' education',' marital-status',
' occupation', ' relationship', ' race', ' sex',' native-country', ' income']
In [ ]:
# NOTE(review): verbatim duplicate of the cell above.
label_list = [' workclass', ' education',' marital-status',
' occupation', ' relationship', ' race', ' sex',' native-country', ' income']
In [ ]:
df.head()
Out[ ]:
| age | workclass | fnlwgt | education | education-num | marital-status | occupation | relationship | race | sex | capital-gain | capital-loss | hours-per-week | native-country | income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
| 1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
| 2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
| 3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
| 4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
In [ ]:
# Assuming 'df' is your DataFrame
# 1. Identify categorical features (all object-dtype columns)
categorical_features = df.select_dtypes(include=['object']).columns
# 2. Create a LabelEncoder for each categorical feature, kept in a dict so
# the per-column mapping could be inverted later if needed
label_encoders = {}
for feature in categorical_features:
label_encoders[feature] = LabelEncoder()
df[feature] = label_encoders[feature].fit_transform(df[feature])
# 3. Now proceed with your model training
X = df.drop([' income'],axis=1).values # independent features
y = df[' income'].values
In [ ]:
# NOTE(review): X_train/X_test/y_train/y_test are created by the
# train_test_split in a LATER cell — this cell only runs cleanly when the
# notebook is executed out of order. Move it below the split.
print("X_train shape:",X_train.shape)
print("y_test shape:",y_test.shape)
print("X_test shape:",X_test.shape)
print("y_train shape:",y_train.shape)
X_train shape: (26048, 14) y_test shape: (6513,) X_test shape: (6513, 14) y_train shape: (26048,)
In [ ]:
# Modelling imports (several repeat earlier imports; harmless re-imports).
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
In [ ]:
# Choose your test size to split between training and testing sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
gradient_booster = GradientBoostingClassifier(learning_rate=0.1)
# 5-fold cross-validated accuracy, computed on the training split only.
accuracies = cross_val_score(gradient_booster, X_train, y_train, cv=5)
gradient_booster.fit(X_train,y_train)
print("Train Score:",np.mean(accuracies))
print("Test Score:",gradient_booster.score(X_test,y_test))
Train Score: 0.8647881398244366 Test Score: 0.8708736373407032
In [ ]:
# Record baseline (default hyper-parameter) scores for later comparison
# against the grid-searched model below.
result_dict_train = {}
result_dict_test = {}
result_dict_train["Gradient-Boost Default Train Score"] = np.mean(accuracies)
result_dict_test["Gradient-Boost Default Test Score"] = gradient_booster.score(X_test,y_test)
In [ ]:
# Hyper-parameter search over learning rate and number of boosting stages.
# FIX: GridSearchCV was used here without ever being imported anywhere in
# this notebook — a fresh top-to-bottom run raises NameError.
from sklearn.model_selection import GridSearchCV

grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': np.arange(100, 500, 100),
}
gb = GradientBoostingClassifier()
# 4-fold CV over all 3 x 4 = 12 parameter combinations.
gb_cv = GridSearchCV(gb, grid, cv = 4)
gb_cv.fit(X_train, y_train)
print("Best Parameters:", gb_cv.best_params_)
print("Train Score:", gb_cv.best_score_)
print("Test Score:", gb_cv.score(X_test, y_test))
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 400}
Train Score: 0.8711993243243243
Test Score: 0.8777828957469676
In [ ]:
# NOTE(review): a notebook cell displays only its LAST expression, so
# result_dict_train is evaluated but never shown — use print() to see both.
result_dict_train
result_dict_test
Out[ ]:
{'Gradient-Boost Default Test Score': 0.8708736373407032}